Q1
First I load the data files
library(tidyverse)
dat <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/migration2012.csv")
states <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/states_chord.csv")
dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District.of.Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New.Hampshire,New.Jersey,New.Mexico,New.York,North.Carolina,North.Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode.Island,South.Carolina,South.Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West.Virginia,Wisconsin,Wyoming)
Delete the punctuation.
dat <- data.frame(apply(dat, 2, function(y) gsub("[[:punct:]]", " ", y)))
dat
Then I add two new columns to the above data frame.
d1 <- merge(dat, states, by.x="From", by.y="States")
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"
d2 <- merge(d1, states, by.x="To", by.y="States")
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
dat <- d2
dat
Sort the data frame in the ascending order
dat <- arrange(dat, ID_From)
dat <- arrange(dat, ID_To)
dat
Convert the above data frame into the matrix form
t1 <- dat[,c(1,2,4,5)]
t1 <- t1 %>%
spread(key=To, value=ID_To)
t2 <- dat[,c(1,2,3,4)]
t2 <- t2 %>%
spread(key=To, value=value)
t3 <- rbind(t1[1,],t2)
t3$From <- as.character(t3$From)
t3[1,2] <- 0
t3[1,1] <- "ID_To"
t3 <- t3 %>%
arrange(ID_From)
library(data.table)
data.table 1.10.4
**********
This installation of data.table has not detected OpenMP support. It will still work but in single-threaded mode. If this a Mac and you obtained the Mac binary of data.table from CRAN, CRAN's Mac does not yet support OpenMP. In the meantime please follow our Mac installation instructions on the data.table homepage. If it works and you observe benefits from multiple threads as others have reported, please convince Simon Ubanek by sending him evidence and ask him to turn on OpenMP support when CRAN builds package binaries for Mac. Alternatives are to install Ubuntu on your Mac (which I have done and works well) or use Windows where OpenMP is supported and works well.
**********
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
-----------------------------------------------------------------------------------------------------------------------------------
data.table + dplyr code now lives in dtplyr.
Please library(dtplyr)!
-----------------------------------------------------------------------------------------------------------------------------------
次のパッケージを付け加えます: ‘data.table’
以下のオブジェクトは ‘package:dplyr’ からマスクされています:
between, first, last
以下のオブジェクトは ‘package:purrr’ からマスクされています:
transpose
setcolorder(t3,c("From","ID_From","Connecticut","Maine","Massachusetts","New Hampshire","Rhode Island",
"Vermont","New Jersey","New York","Pennsylvania","Illinois","Indiana",
"Michigan","Ohio","Wisconsin","Iowa","Kansas","Minnesota",
"Missouri","Nebraska","North Dakota","South Dakota","Delaware","Florida",
"Georgia","Maryland","North Carolina","South Carolina","Virginia","District of Columbia",
"West Virginia","Alabama","Kentucky","Mississippi","Tennessee","Arkansas",
"Louisiana","Oklahoma","Texas","Arizona","Colorado","Idaho",
"Montana","Nevada","New Mexico","Utah","Wyoming","Alaska",
"California","Hawaii","Oregon","Washington"))
t3 <- t3[c(-1),c(-2)]
t4 <- t3[,-1]
rownames(t4) <- t3[,1]
t5 <- data.frame(apply(t4, 2, function(y) as.numeric(y)))
rownames(t5) <- t3[,1]
t5
Drawing the chord diagram
chorddiag(as.matrix(t5),groupColors=states$Color,showTicks=F,groupnamePadding = 20,groupThickness=.05,groupnameFontsize=10)
row names of the 'data' matrix differ from its column names or the 'groupNames' argument.
Q2
Read “Stops On Lines” and all the GIS data of the bus lines.
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)
# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, CRS("+proj=longlat +datum=WGS84"))
# Bus Routes
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, CRS("+proj=longlat +datum=WGS84"))
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, CRS("+proj=longlat +datum=WGS84"))
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, CRS("+proj=longlat +datum=WGS84"))
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, CRS("+proj=longlat +datum=WGS84"))
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, CRS("+proj=longlat +datum=WGS84"))
tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)
# Individual Bus Route
pjs <- list()
tmps <- list()
layer_list <- ogrListLayers("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216")
for (i in layer_list){
if (i != 728){
a <- spTransform(readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216", toString(i)), CRS("+proj=longlat +datum=WGS84"))
pjs <- c(pjs, a)
tmps <- c(tmps, geometry(a))
}
}
make Line_list
tmps <- c(tmps, tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)
Line_list <- list()
for (i in 1:length(tmps)){
for (j in 1:length(tmps[[i]])){
Line_list <- c(Line_list, tmps[[i]][j]@lines[[1]]@Lines)
}
}
make new_id
pjs <- c(pjs, CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)
LinLSs <- list()
for (i in 1:length(pjs)){
LinLSs <- c(LinLSs, sapply(pjs[[i]]@lines, function(x) length(x@Lines)))
}
LinLSs <- LinLSs %>% unlist()
new_id <- sapply(1:length(LinLSs), function(x) paste0(x, "_", seq.int(LinLSs[[x]]))) %>%
unlist()
make data frame
## make a new data.frame (only route_id)
DAT=data.frame(matrix(rep(NA,1),nrow=1))[-1,]
for (i in 1:length(pjs)){
df <- data.frame(route_id = pjs[[i]]@data$VAR_IDENT)
DAT <- rbind(DAT, df)
}
rownames(DAT) <- new_id
SLDF <- mapply(function(x, y) Lines(x, ID = y), x = Line_list, y = new_id) %>%
#list() %>%
SpatialLines() %>%
SpatialLinesDataFrame(data = DAT)
make new lines and LA map
dat <- geocode('Los Angels')
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Los%20Angels&sensor=false
leaflet() %>%
setView(lng = dat['lon'], lat = dat['lat'] , zoom = 11) %>%
addPolylines(data = SLDF, color = "black", opacity = 1, weight = 1) %>%
addCircles(data=SOL.pj@data,~LONG, ~LAT, color = "red", weight = 0.3) %>%
addTiles()
Q3
First I load the data.
library(quantmod)
library(highcharter)
x <- getSymbols("AUD/JPY", src = "oanda", auto.assign = FALSE)
y <- getSymbols("GBP/USD", src = "oanda", auto.assign = FALSE)
Next make Bollinger’s bands for each exchange rate.
x.BBands.ll <- BBands(x)$dn
x.BBands.ul <- BBands(x)$up
x.BBands.m <- BBands(x)$mavg
y.BBands.ll <- BBands(y)$dn
y.BBands.ul <- BBands(y)$up
y.BBands.m <- BBands(y)$mavg
The drawing code is as follows.
hc <- highchart(type="stock") %>%
hc_title(text="Charting Exchange Rates") %>%
hc_subtitle(text = "Data extracted using quantmod package") %>%
hc_yAxis_multiples(
list(top = "0%", height = "50%", offset=0, opposite=TRUE),
list(top = "50%", height = "50%", offset=0, opposite=TRUE)
)%>%
hc_add_series(x, id = "audjpy",name ="audjpy", yAxis=0, color="blue", lineWidth=1.5) %>%
hc_add_series(x.BBands.ll, id = "audjpy.ll", name="audjpy Lower BBands",yAxis=0,
color="black",dashStyle='shortdash', lineWidth=1) %>%
hc_add_series(x.BBands.ul, id = "audjpy.ul", name="audjpy Upper BBands",yAxis=0,
color="black",lineWidth=1) %>%
hc_add_series(x.BBands.m, id = "audjpy.m",name="audjpy BBands MA", yAxis=0,
color="red",lineWidth=1) %>%
hc_add_series(y, id = "gbpusd",name="gbpusd",yAxis=1, color="green", lineWidth=1.5) %>%
hc_add_series(y.BBands.ll, id = "gbpusd.ll",name="gbpusd Lower BBands", yAxis=1,
color="black",dashStyle='shortdash',lineWidth=1) %>%
hc_add_series(y.BBands.ul, id = "gbpusd.ul",name="gbpusd Upper BBands", yAxis=1,
color="black",lineWidth=1) %>%
hc_add_series(y.BBands.m, id = "gbpusd.m",name="gbpusd BBands MA", yAxis=1,
color="red",lineWidth=1) %>%
hc_add_theme(hc_theme_538())
hc
Q4
Load libraries and check the raw data. And make ffdf after converting character columns to factor columns in original df.
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)
tmp <- flights
tmp$carrier <- as.factor(tmp$carrier)
tmp$tailnum <- as.factor(tmp$tailnum)
tmp$origin <- as.factor(tmp$origin)
tmp$dest <- as.factor(tmp$dest)
flightff <- as.ffdf(tmp)
Next I make new columns as follows
flightff$Delay <- ffifelse(flightff$dep_delay > 0 | flightff$dep_delay == 0 , 1,0)
flightff$DepHour <- flightff$hour
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)
I exclude the rows whose Delay values are NA and rename it to logitff. And then I split the dataset into train set and test set.
logitff <- flightff[!is.na(flightff$Delay),]
indx <- ff(1:nrow(logitff))
p <- 0.7
trainIndx <- ff(indx[1:trunc(length(indx)*p)])
trainset <- logitff[trainIndx,]
testIndx <- ff(indx[(trunc(length(indx)*p)+1):length(indx)])
testset <- logitff[testIndx,]
Logistic regression
fit <- bigglm.ffdf(Delay~DepHour+Car+Night+Weekend, data = trainset, family=binomial(), sandwich=TRUE)
summary(fit)
Large data regression model: bigglm(Delay ~ DepHour + Car + Night + Weekend, data = trainset,
family = binomial(), sandwich = TRUE)
Sample size = 229964
Coef (95% CI) SE p
(Intercept) -1.5418 -1.5731 -1.5104 0.0157 0
DepHour 0.1020 0.0996 0.1044 0.0012 0
Car -0.0680 -0.0858 -0.0503 0.0089 0
Night -0.2619 -0.2912 -0.2326 0.0147 0
Weekend -0.1543 -0.1821 -0.1265 0.0139 0
Sandwich (model-robust) standard errors
predict and make confusionmatrix in train_set
train_pred <- predict(fit, newdata = trainset, type="response")
train_pred <- ifelse(train_pred>0.5, 1,0)
train_confusion <- table(as.integer(as.data.frame(trainset)$Delay), as.integer(train_pred))
train_confusion <- addmargins(train_confusion)
train_confusion
0 1 Sum
0 94128 36392 130520
1 56845 42599 99444
Sum 150973 78991 229964
predict and make confusionmatrix in test_set
test_pred <- predict(fit, newdata = testset, type="response")
test_pred <- ifelse(test_pred>0.5, 1,0)
test_confusion <- table(as.integer(as.data.frame(testset)$Delay), as.integer(test_pred))
test_confusion <- addmargins(test_confusion)
test_confusion
0 1 Sum
0 39997 13058 53055
1 25558 19944 45502
Sum 65555 33002 98557
Draw ROC curve
test_pred <- predict(fit, newdata = testset, type="response")
roc <- roc(as.integer(as.data.frame(testset)$Delay), as.numeric(test_pred))
plot(roc)

Q5
First I load the data. And before using spark I delete the irrelevant columns.
Remove the observations with an empty BOROUGH or with a missing or zero LATITUDE/LONGITUDE.
Split this data into trainset and testset.
test
$test
NA
Use Decision tree
decision_tree <- train %>%
ml_decision_tree(response="BOROUGH", features = c("LATITUDE","LONGITUDE"), max.bins = 200L, max.depth = 10L, seed=123L) %>%
Prediction
table(pred$BOROUGH, pred$prediction)
0 1 2 3 4
BRONX 0 0 0 8876 0
BROOKLYN 21181 0 24 0 1
MANHATTAN 0 17944 0 0 0
QUEENS 25 1 17777 0 0
STATEN ISLAND 23 0 0 0 3169
---
title: "Big Data Analytics Assignment 1"
output:
  html_notebook: default
  html_document: default
  pdf_document: default
---

## Q1

First I load the data files
```{r}
# Load the tidyverse (pipe, tidyr, dplyr) and read the two inputs: the
# state lookup table and the migration table.
library(tidyverse)
states <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/states_chord.csv")
dat <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/migration2012.csv")

# Reshape from wide to long: each listed state column becomes a row whose
# name is stored in `From` and whose cell is stored in `value`.
dat <- tidyr::gather(
  dat,
  key = From, value = value,
  Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut,
  Delaware, District.of.Columbia, Florida, Georgia, Hawaii, Idaho, Illinois,
  Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Maryland, Massachusetts,
  Michigan, Minnesota, Mississippi, Missouri, Montana, Nebraska, Nevada,
  New.Hampshire, New.Jersey, New.Mexico, New.York, North.Carolina,
  North.Dakota, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode.Island,
  South.Carolina, South.Dakota, Tennessee, Texas, Utah, Vermont, Virginia,
  Washington, West.Virginia, Wisconsin, Wyoming
)
```

Delete the punctuation.
```{r}
# Replace punctuation with spaces in the two state-name columns only, so that
# names such as "New.York" (as spelled in the gather() call) become
# "New York" and can be matched against the lookup table.
# The numeric `value` column is deliberately left alone: running apply() over
# the whole data frame would push it through a character matrix and gsub().
name_cols <- c("To", "From")
dat[name_cols] <- lapply(
  dat[name_cols],
  function(y) gsub("[[:punct:]]", " ", y)
)
dat
```

Then I add two new columns to the above data frame.

```{r}
# Attach the lookup table once per endpoint. merge() puts the key column
# first, then the remaining columns of `dat`, then the remaining columns of
# `states`; the first lookup column is kept as the ID and the two after it
# are dropped by position.
# NOTE(review): the positional drops assume `states` has exactly three
# columns besides `States` -- confirm against the CSV.

# Origin side: keep column 4 as ID_From, drop columns 5 and 6.
d1 <- merge(dat, states, by.x = "From", by.y = "States")[, -(5:6)]
names(d1)[4] <- "ID_From"

# Destination side: keep column 5 as ID_To, drop columns 6 and 7.
d2 <- merge(d1, states, by.x = "To", by.y = "States")[, -(6:7)]
names(d2)[5] <- "ID_To"

dat <- d2
dat
```

Sort the data frame in the ascending order

```{r}
# Two successive sorts: the last one (ID_To) decides the final primary order.
dat <- dat %>%
  arrange(ID_From) %>%
  arrange(ID_To)
dat
```

Convert the above data frame into the matrix form

```{r}
# Wide table of destination IDs: one row per origin, one column per
# destination (columns 1, 2, 4, 5 are expected to be To, From, ID_From,
# ID_To).
t1 <- dat[,c(1,2,4,5)]
t1 <- t1 %>%
  spread(key=To, value=ID_To)

# Wide table of migration values with the same layout (columns 1-4 are
# expected to be To, From, value, ID_From).
t2 <- dat[,c(1,2,3,4)]
t2 <- t2 %>%
  spread(key=To, value=value)

# Put the first row of the ID table on top of the value table as a header
# row labelled "ID_To". Its ID_From is set to 0 and the table is then
# ordered by ID_From.
# NOTE(review): this keeps the header row first only if every real ID_From
# is greater than 0 -- confirm.
t3 <- rbind(t1[1,],t2)
t3$From <- as.character(t3$From)
t3[1,2] <- 0
t3[1,1] <- "ID_To"
t3 <- t3 %>%
  arrange(ID_From)

# Fix the column order explicitly (setcolorder() reorders by reference).
library(data.table)
setcolorder(t3,c("From","ID_From","Connecticut","Maine","Massachusetts","New Hampshire","Rhode Island",
"Vermont","New Jersey","New York","Pennsylvania","Illinois","Indiana",
"Michigan","Ohio","Wisconsin","Iowa","Kansas","Minnesota",
"Missouri","Nebraska","North Dakota","South Dakota","Delaware","Florida",
"Georgia","Maryland","North Carolina","South Carolina","Virginia","District of Columbia",
"West Virginia","Alabama","Kentucky","Mississippi","Tennessee","Arkansas",
"Louisiana","Oklahoma","Texas","Arizona","Colorado","Idaho",
"Montana","Nevada","New Mexico","Utah","Wyoming","Alaska",
"California","Hawaii","Oregon","Washington"))

# Drop the header row (row 1) and the ID_From column (column 2).
t3 <- t3[c(-1),c(-2)]

# Everything except the From column is the flow matrix; origins become the
# row names.
t4 <- t3[,-1]
rownames(t4) <- t3[,1]

# Convert every cell to numeric. check.names = FALSE keeps column names such
# as "New Hampshire" intact; the default would rewrite them to
# "New.Hampshire", so they would no longer match the row names that
# chorddiag() compares them with.
t5 <- data.frame(apply(t4, 2, as.numeric), check.names = FALSE)
rownames(t5) <- t3[,1]
t5
```

Drawing the chord diagram
```{r}
library(chorddiag)

# Draw the flow matrix as a chord diagram, coloured with the Color column of
# the state lookup table. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
# NOTE(review): groupColors is taken in the row order of `states`; confirm
# that order matches the row/column order of t5.
chorddiag(
  as.matrix(t5),
  groupColors = states$Color,
  showTicks = FALSE,
  groupnamePadding = 20,
  groupThickness = .05,
  groupnameFontsize = 10
)
```


## Q2

Read "Stops On Lines" and all the GIS data of the bus lines.
```{r}
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)

# Every layer is reprojected to the same longitude/latitude WGS84 CRS, so the
# CRS object is built once and shared.
wgs84 <- CRS("+proj=longlat +datum=WGS84")

# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, wgs84)

# Bus Routes (one shapefile per service category)
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, wgs84)
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, wgs84)
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, wgs84)
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, wgs84)
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, wgs84)

# Geometry-only copies of the category layers (attribute tables dropped).
tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)

# Individual Bus Route
# Read every layer of the Individuals1216 data source except the one named
# "728" (the reason for skipping it is not recorded here), reproject each,
# and keep both the full objects (pjs) and their geometries (tmps).
# lapply() builds each list in one pass instead of growing it with c()
# inside a loop.
individuals_dsn <- "/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216"
layer_list <- ogrListLayers(individuals_dsn)
route_layers <- as.character(layer_list)
route_layers <- route_layers[route_layers != "728"]
pjs <- lapply(route_layers, function(layer) {
  spTransform(readOGR(individuals_dsn, layer), wgs84)
})
tmps <- lapply(pjs, geometry)
```

make Line_list
```{r}
# Append the category-level geometries to the per-route geometries.
tmps <- c(tmps, tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)

# Flatten to one list of Line objects: for every geometry object, for every
# feature in its @lines slot, take the Line segments stored in @Lines.
# Built with lapply()/do.call(c, ...) rather than by growing a list inside
# nested 1:length() loops.
Line_list <- do.call(c, lapply(unname(tmps), function(route_geom) {
  do.call(c, lapply(unname(route_geom@lines), function(feature) {
    feature@Lines
  }))
}))
```


make new_id
```{r}
# Append the category-level layers to the per-route layers.
pjs <- c(pjs, CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)

# Number of Line segments in each feature, across all layers, as one flat
# integer vector. vapply() pins the per-feature result to a single integer.
LinLSs <- unlist(
  lapply(pjs, function(layer) {
    vapply(layer@lines, function(x) length(x@Lines), integer(1))
  }),
  use.names = FALSE
)

# One ID per segment: "<feature index>_<segment index within the feature>".
# lapply() + unlist() always yields a flat character vector, whereas sapply()
# returns a matrix when every feature has the same number of segments.
new_id <- unlist(
  lapply(seq_along(LinLSs), function(k) {
    paste0(k, "_", seq.int(LinLSs[[k]]))
  }),
  use.names = FALSE
)
```

make data frame
```{r}
## make a new data.frame (only route_id)
# One row per entry of each layer's VAR_IDENT attribute, stacked in a single
# rbind() call instead of growing a data frame inside a loop.
DAT <- do.call(rbind, lapply(unname(pjs), function(layer) {
  data.frame(route_id = layer@data$VAR_IDENT)
}))
# NOTE(review): new_id holds one ID per Line segment while DAT has one row
# per attribute record, so this assignment only succeeds when the two counts
# agree (i.e. every feature has exactly one segment) -- confirm.
rownames(DAT) <- new_id

# Wrap every segment in its own Lines object carrying the matching ID, then
# assemble them into a SpatialLinesDataFrame with DAT as the attribute table.
SLDF <- mapply(
  function(segment, id) Lines(segment, ID = id),
  Line_list, new_id,
  SIMPLIFY = FALSE
) %>%
  SpatialLines() %>%
  SpatialLinesDataFrame(data = DAT)
```


make new lines and LA map
```{r}
# Look up the map centre. The query is spelled "Los Angeles" (it previously
# read 'Los Angels').
dat <- geocode("Los Angeles")

# Centre the view on the geocoded point, draw all bus lines in black and all
# stops as red circles, then add the base tiles.
# dat$lon / dat$lat extract plain numeric values; dat['lon'] would hand
# setView() a one-column data frame instead.
leaflet() %>%
  setView(lng = dat$lon, lat = dat$lat, zoom = 11) %>%
  addPolylines(data = SLDF, color = "black", opacity = 1, weight = 1) %>%
  addCircles(data = SOL.pj@data, lng = ~LONG, lat = ~LAT, color = "red", weight = 0.3) %>%
  addTiles()
```


## Q3

First I load the data.
```{r}
library(quantmod)
library(highcharter)

# Download one currency pair from the "oanda" source and return the series
# as a value (auto.assign = FALSE) instead of creating a variable.
fetch_rate <- function(pair) {
  getSymbols(pair, src = "oanda", auto.assign = FALSE)
}

x <- fetch_rate("AUD/JPY")
y <- fetch_rate("GBP/USD")
```

Next make Bollinger's bands for each exchange rate.
```{r}
# Compute the Bollinger bands once per series (default BBands() settings) and
# pull the lower band (dn), upper band (up) and moving average (mavg) out of
# the single result instead of recomputing BBands() for each column.
x_bands <- BBands(x)
x.BBands.ll <- x_bands$dn
x.BBands.ul <- x_bands$up
x.BBands.m <- x_bands$mavg

y_bands <- BBands(y)
y.BBands.ll <- y_bands$dn
y.BBands.ul <- y_bands$up
y.BBands.m <- y_bands$mavg
```

The drawing code is as follows.
```{r}
# Chart skeleton: a stock chart with a title, a subtitle and two stacked
# y-axes, each taking half of the height (axis 0 on top, axis 1 below).
hc <- highchart(type = "stock") %>%
  hc_title(text = "Charting Exchange Rates") %>%
  hc_subtitle(text = "Data extracted using quantmod package") %>%
  hc_yAxis_multiples(
    list(top = "0%", height = "50%", offset = 0, opposite = TRUE),
    list(top = "50%", height = "50%", offset = 0, opposite = TRUE)
  )

# AUD/JPY and its three band series on the upper axis (yAxis = 0).
hc <- hc %>%
  hc_add_series(x, id = "audjpy", name = "audjpy", yAxis = 0,
                color = "blue", lineWidth = 1.5) %>%
  hc_add_series(x.BBands.ll, id = "audjpy.ll", name = "audjpy Lower BBands",
                yAxis = 0, color = "black", dashStyle = "shortdash",
                lineWidth = 1) %>%
  hc_add_series(x.BBands.ul, id = "audjpy.ul", name = "audjpy Upper BBands",
                yAxis = 0, color = "black", lineWidth = 1) %>%
  hc_add_series(x.BBands.m, id = "audjpy.m", name = "audjpy BBands MA",
                yAxis = 0, color = "red", lineWidth = 1)

# GBP/USD and its three band series on the lower axis (yAxis = 1).
hc <- hc %>%
  hc_add_series(y, id = "gbpusd", name = "gbpusd", yAxis = 1,
                color = "green", lineWidth = 1.5) %>%
  hc_add_series(y.BBands.ll, id = "gbpusd.ll", name = "gbpusd Lower BBands",
                yAxis = 1, color = "black", dashStyle = "shortdash",
                lineWidth = 1) %>%
  hc_add_series(y.BBands.ul, id = "gbpusd.ul", name = "gbpusd Upper BBands",
                yAxis = 1, color = "black", lineWidth = 1) %>%
  hc_add_series(y.BBands.m, id = "gbpusd.m", name = "gbpusd BBands MA",
                yAxis = 1, color = "red", lineWidth = 1)

# Apply the 538 theme last, then print the widget.
hc <- hc_add_theme(hc, hc_theme_538())

hc
```


## Q4

Load the libraries and check the raw data. Then convert the character columns of the original data frame to factors and build the ffdf.
```{r}
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)

# Work on a copy of the flights table and turn its four text columns into
# factors before handing the table to as.ffdf().
tmp <- flights
factor_cols <- c("carrier", "tailnum", "origin", "dest")
tmp[factor_cols] <- lapply(tmp[factor_cols], as.factor)

flightff <- as.ffdf(tmp)
```

Next I make new columns as follows
```{r}
# Derived 0/1 indicator columns used as response and predictors of the
# logistic regression.

# Delay: 1 when the departure delay is zero or positive, 0 otherwise.
flightff$Delay <- ffifelse(flightff$dep_delay > 0 | flightff$dep_delay == 0 , 1,0)
# DepHour: copy of the hour column.
flightff$DepHour <- flightff$hour
# Car: 1 for carriers DL, US, DH or UA, 0 otherwise.
# NOTE(review): confirm "DH" is a carrier code that occurs in this data set.
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
# Night: 1 when the hour is after 18 or before 6, 0 otherwise.
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
# Weekend: 1 when day.of.week() returns 6, 0 otherwise.
# NOTE(review): chron::day.of.week() numbers days from 0 (Sunday) to
# 6 (Saturday), so this flags Saturdays only -- confirm Sunday is meant to be
# excluded.
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)
```

I exclude the rows whose Delay value is NA and store the result as logitff.
Then I split the dataset into a training set and a test set.
```{r}
# Keep only the rows with a known Delay value.
logitff <- flightff[!is.na(flightff$Delay),]

# Sequential split by row position: the first p (70%) of the rows form the
# training set, the remaining rows form the test set.
p <- 0.7
indx <- ff(1:nrow(logitff))
n_total <- length(indx)
n_train <- trunc(n_total * p)

trainIndx <- ff(indx[1:n_train])
testIndx <- ff(indx[(n_train + 1):n_total])

trainset <- logitff[trainIndx,]
testset <- logitff[testIndx,]
```

Logistic regression 
```{r}
# Logistic regression (binomial family) of Delay on the four derived
# predictors, fitted on the training set with sandwich standard errors.
fit <- bigglm.ffdf(
  Delay ~ DepHour + Car + Night + Weekend,
  data = trainset,
  family = binomial(),
  sandwich = TRUE
)
summary(fit)
```

predict and make confusionmatrix in train_set
```{r}
# Predicted probabilities on the training set, cut at 0.5 into 0/1 labels.
train_pred <- ifelse(
  predict(fit, newdata = trainset, type = "response") > 0.5,
  1, 0
)

# Confusion matrix (rows: observed Delay, columns: predicted label) with
# row/column totals added.
train_confusion <- addmargins(
  table(as.integer(as.data.frame(trainset)$Delay), as.integer(train_pred))
)
train_confusion
```

predict and make confusionmatrix in test_set
```{r}
# Predicted probabilities on the test set, cut at 0.5 into 0/1 labels.
test_pred <- ifelse(
  predict(fit, newdata = testset, type = "response") > 0.5,
  1, 0
)

# Confusion matrix (rows: observed Delay, columns: predicted label) with
# row/column totals added.
test_confusion <- addmargins(
  table(as.integer(as.data.frame(testset)$Delay), as.integer(test_pred))
)
test_confusion
```

Draw ROC curve
```{r}
# The ROC curve needs the predicted probabilities themselves, so they are
# computed here without any 0.5 cut-off.
test_pred <- predict(fit, newdata = testset, type = "response")

# Observed Delay against predicted probability.
roc <- roc(
  as.integer(as.data.frame(testset)$Delay),
  as.numeric(test_pred)
)
plot(roc)
```


## Q5

First I load the data. Before copying it to Spark, I drop the irrelevant columns.
```{r}
library(sparklyr)
library(dplyr)
library(readr)
sc <- spark_connect(master = "local")

# Troubleshooting notes:
# The original CSV seems to have a problem? -> "Unspecified" appears to be
# rejected -> maybe replace "Unspecified" with 0 -> then "PASSENGER VEHICKE"
# gets rejected instead.
# In short, every string value seems to fail. Yet with the titanic data,
# strings are passed without trouble. Why?
# Those two examples show that whitespace is not the problem.

# Read the collisions CSV, keep only the four columns used below, and copy
# them into Spark as "nypd_tbl" (replacing any existing table of that name).
keep_cols <- c("BOROUGH","LATITUDE","LONGITUDE","UNIQUE KEY")
dat <- read_csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/NYPD_Motor_Vehicle_Collisions.csv")[, keep_cols]
nypd_tbl <- copy_to(sc, dat, "nypd_tbl", overwrite = TRUE)
```

Remove the observations with an empty BOROUGH or with a missing or zero LATITUDE/LONGITUDE.
```{r}
# Keep only rows with a non-empty borough and with coordinates that are
# neither missing nor zero.
nypd_tbl <- filter(
  nypd_tbl,
  BOROUGH != "",
  !is.na(LATITUDE),
  !is.na(LONGITUDE),
  LATITUDE != 0,
  LONGITUDE != 0
)
nypd_tbl
```

Split this data into trainset and testset.
```{r}
# Split 90% / 10% into training and test partitions with a fixed seed.
partitions <- sdf_partition(nypd_tbl, training = 0.9, test = 0.1, seed = 123)

# Pull the two partitions out of the returned list by name.
train <- partitions$training
test <- partitions$test
```

Use Decision tree
```{r}
# Fit a decision tree that predicts BOROUGH from the two coordinates.
# The pipe chain ends on this call: a trailing %>% with nothing after it
# leaves the expression incomplete and the chunk fails to parse.
decision_tree <- train %>%
  ml_decision_tree(
    response = "BOROUGH",
    features = c("LATITUDE", "LONGITUDE"),
    max.bins = 200L,
    max.depth = 10L,
    seed = 123L
  )
```

Prediction
```{r}
# Score the test partition in Spark and bring the result into R.
pred <- collect(sdf_predict(decision_tree, test))

# Cross-tabulate the observed borough against the predicted class index.
table(pred$BOROUGH, pred$prediction)
```
